Title: The News that Shifts the Value of Cryptocurrency
Name: Dongming Jin Affiliations: UT Rio Grande Valley, UT Arlington Emails: dongming.jin@utrgv.edu, dongming.jin@mavs.uta.edu
Abstract: Ever since the birth of cryptocurrency, its nature and value have been highly debated. It is the ideal digital asset for the world of the internet: decentralized and inherently resistant to modification of ownership records. Even though its nature and value have been highly debated, the combination of security and transparency makes it one of the most important innovations in the era of 'cloud data', where security is the last shield of privacy. I used the Doc2Vec technique to build a semantic model from archived news from the Wall Street Journal and then applied a KNN model to classify the topics. The correlation of news from each topic with the same-day value change of Bitcoin, one of the major cryptocurrencies, was explored with a RandomForest model, in order to predict event-driven price changes.
Cryptocurrency is a concept of digital currency in which encryption techniques are used to regulate the generation of currency units and verify the transfer of funds, based on blockchain, a distributed ledger that is inherently resistant to modification of the data.
# Notebook magic: pulls numpy + matplotlib.pyplot names (figure, title,
# xticks, imshow, arange, plt, ...) into the global namespace; many later
# cells rely on these bare names.
%pylab inline
import os
import numpy as np
import pandas as pd
import pickle
import quandl
from datetime import datetime
import seaborn as sns
from collections import Counter
# Global seaborn/matplotlib style: large fonts and a wide default figure.
rc={'xtick.labelsize': 30, 'ytick.labelsize': 30, 'axes.labelsize': 30, 'font.size': 30,
    'legend.fontsize': 25.0, 'axes.titlesize': 30, "figure.figsize": [30, 10]}
sns.set(rc=rc)
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
# Render plotly charts inline in the notebook (no upload to plotly.com).
py.init_notebook_mode(connected=True)
def get_quandl_data(quandl_id):
    '''Download and cache a Quandl dataseries as a pickled DataFrame.

    Parameters
    ----------
    quandl_id : str
        Quandl code, e.g. 'BCHARTS/KRAKENUSD'. The '/' is replaced by '-'
        to build the local cache filename under history/.

    Returns
    -------
    pandas.DataFrame
    '''
    cache_path = 'history/{}.pkl'.format(quandl_id.replace('/', '-'))
    try:
        # Cache hit: load the previously pickled dataframe.
        # 'with' guarantees the handle is closed (the original leaked it).
        with open(cache_path, 'rb') as f:
            df = pickle.load(f)
        print('Loaded {} from cache'.format(quandl_id))
    except (OSError, IOError):
        # Cache miss: download from Quandl and persist for next time.
        print('Downloading {} from Quandl'.format(quandl_id))
        df = quandl.get(quandl_id.replace('history/', ''), returns="pandas")
        df.to_pickle(cache_path)
        print('Cached {} at {}'.format(quandl_id, cache_path))
    return df
# Pull Kraken BTC price exchange data
btc_usd_price_kraken = get_quandl_data('BCHARTS/KRAKENUSD')
btc_usd_price_kraken.head()
# Chart the BTC pricing data
btc_trace = go.Scatter(x=btc_usd_price_kraken.index, y=btc_usd_price_kraken['Weighted Price'])
py.iplot([btc_trace])
# Pull pricing data for 3 more BTC exchanges
exchanges = ['COINBASE','BITSTAMP','ITBIT']
exchange_data = {}
# Keep Kraken alongside the others; dict is keyed by exchange name.
exchange_data['KRAKEN'] = btc_usd_price_kraken
for exchange in exchanges:
    exchange_code = 'BCHARTS/{}USD'.format(exchange)
    btc_exchange_df = get_quandl_data(exchange_code)
    exchange_data[exchange] = btc_exchange_df
def merge_dfs_on_column(dataframes, labels, col):
    '''Combine the column *col* from every dataframe into one dataframe.

    The i-th dataframe's column becomes the column named labels[i] in the
    result; rows are aligned on the dataframes' indexes.
    '''
    return pd.DataFrame({labels[i]: frame[col]
                         for i, frame in enumerate(dataframes)})
# Merge the BTC price dataseries' into a single dataframe
btc_usd_datasets = merge_dfs_on_column(list(exchange_data.values()), list(exchange_data.keys()), 'Weighted Price')
btc_usd_datasets.tail()
# Merge the BTC volume dataseries' into a single dataframe
btc_vol_datasets = merge_dfs_on_column(list(exchange_data.values()), list(exchange_data.keys()), 'Volume (BTC)')
# convert into USD-based volume
# Element-wise product: BTC volume * BTC/USD price = volume in USD.
btc_vol_datasets = btc_vol_datasets * btc_usd_datasets
def df_scatter(df, title, seperate_y_axis=False, y_axis_label='', scale='linear', initial_hide=False):
    '''Generate a plotly scatter chart with one trace per dataframe column.

    Parameters ('seperate' spelling kept -- callers pass it as a keyword):
        df              : DataFrame; its index supplies the x (date) axis.
        title           : chart title.
        seperate_y_axis : if True, give each series its own overlaid y axis.
        y_axis_label    : label for the primary y axis.
        scale           : 'linear' or 'log' axis type.
        initial_hide    : if True, traces start hidden ('legendonly').
    '''
    label_arr = list(df)
    series_arr = list(map(lambda col: df[col], label_arr))
    layout = go.Layout(
        title=title,
        legend=dict(orientation="h"),
        xaxis=dict(type='date'),
        yaxis=dict(
            title=y_axis_label,
            # Hide tick labels when each series carries its own axis.
            showticklabels= not seperate_y_axis,
            type=scale
        )
    )
    # Shared config reused for every extra per-series y axis (overlaid,
    # unlabelled); all series therefore share identical axis settings.
    y_axis_config = dict(
        overlaying='y',
        showticklabels=False,
        type=scale )
    visibility = 'visible'
    if initial_hide:
        visibility = 'legendonly'
    # Form Trace For Each Series
    trace_arr = []
    for index, series in enumerate(series_arr):
        trace = go.Scatter(
            x=series.index,
            y=series,
            name=label_arr[index],
            visible=visibility
        )
        # Add seperate axis for the series (y2, y3, ... matched in layout).
        if seperate_y_axis:
            trace['yaxis'] = 'y{}'.format(index + 1)
            layout['yaxis{}'.format(index + 1)] = y_axis_config
        trace_arr.append(trace)
    fig = go.Figure(data=trace_arr, layout=layout)
    py.iplot(fig)
# Plot all of the BTC exchange prices
df_scatter(btc_usd_datasets, 'Bitcoin Price (USD) By Exchange')
# Remove "0" values
# Zero quotes are missing data (exchange outages), not real prices.
btc_usd_datasets.replace(0, np.nan, inplace=True)
btc_vol_datasets.replace(0, np.nan, inplace=True)
# Plot the revised dataframe
df_scatter(btc_usd_datasets, 'Bitcoin Price (USD) By Exchange')
# Plot the revised dataframe, rolling average
btc_vol_datasets.rolling(14).mean().plot(logy=True)
title("Bitcoin 14 days rolling Volume in USD")
# savefig('fig/rolling_vol.jpeg', dpi=200, bbox_inches='tight')
# Calculate the average BTC price as a new column
# Row-wise mean across exchanges; NaNs from outages are skipped.
btc_usd_datasets['avg_btc_price_usd'] = btc_usd_datasets.mean(axis=1)
btc_vol_datasets['tot_btc_vol_usd'] = btc_vol_datasets.sum(axis=1)
# Plot the average BTC price
btc_trace = go.Scatter(x=btc_usd_datasets.index, y=btc_usd_datasets['avg_btc_price_usd'])
py.iplot([btc_trace])
# Plot the average BTC volume
btc_vol_datasets.tot_btc_vol_usd.rolling(14).mean().plot(logy=True)
btc_usd_datasets.describe()
btc_vol_datasets.describe()
# Restrict the change-rate plots to calendar year 2017.
# pd.datetime was deprecated and removed from pandas; the stdlib datetime
# class (imported at the top of the file) is the supported replacement.
start_date = datetime(2017, 1, 1)
end_date = datetime(2017, 12, 31)
# 14-day rolling mean of the daily price change rate.
btc_usd_datasets.avg_btc_price_usd.pct_change().rolling(14).mean().plot()
xlim([start_date, end_date])
# Change rate of the 14-day rolling mean volume (log y scale).
btc_vol_datasets.tot_btc_vol_usd.rolling(14).mean().pct_change().plot(logy=True)
xlim([start_date, end_date])
from pytrends.request import TrendReq
pytrends = TrendReq(hl='en-US', tz=360)
# Google Trends interest for crypto-related search terms over 5 years.
kw_list = ["Blockchain","BTC","hack"]
pytrends.build_payload(kw_list, cat=0, timeframe='today 5-y', geo='', gprop='')
pytrends.interest_over_time().plot()
def get_json_data(json_url, cache_path):
    '''Download and cache JSON data, return as a dataframe.

    Parameters
    ----------
    json_url : str
        URL returning a JSON payload pandas can parse.
    cache_path : str
        Local pickle path; used as the cache on subsequent calls.

    Returns
    -------
    pandas.DataFrame
    '''
    try:
        # Cache hit: 'with' guarantees the file handle is closed
        # (the original opened it and never closed it).
        with open(cache_path, 'rb') as f:
            df = pickle.load(f)
        print('Loaded {} from cache'.format(json_url))
    except (OSError, IOError):
        # Cache miss: fetch, parse and persist for next time.
        print('Downloading {}'.format(json_url))
        df = pd.read_json(json_url)
        df.to_pickle(cache_path)
        print('Cached {} at {}'.format(json_url, cache_path))
    return df
# Poloniex public API endpoint for daily candlestick data.
# NOTE: the original string contained the mojibake character '¤' where the
# HTML entity '&curren' swallowed the '&curren' prefix of '&currencyPair';
# the query parameter is restored here.
base_polo_url = 'https://poloniex.com/public?command=returnChartData&currencyPair={}&start={}&end={}&period={}'
start_date = datetime.strptime('2015-01-01', '%Y-%m-%d')  # get data from the start of 2015
end_date = datetime.now()  # up until today
# 86,400 seconds per day -> daily candles. Name keeps the original 'pediod'
# typo because get_crypto_data() below references it by this name.
pediod = 86400
def get_crypto_data(poloniex_pair):
    '''Fetch daily chart data for one currency pair from the Poloniex API.

    *poloniex_pair* doubles as the local cache path (e.g. 'history/BTC_ETH');
    the 'history/' prefix is stripped when building the request URL.
    Returns a dataframe indexed by the 'date' column.
    '''
    request_url = base_polo_url.format(
        poloniex_pair.replace('history/', ''),
        start_date.timestamp(),
        end_date.timestamp(),
        pediod,
    )
    frame = get_json_data(request_url, poloniex_pair)
    return frame.set_index('date')
# Altcoins quoted against BTC on Poloniex.
altcoins = ['ETH','LTC','XRP','ETC','STR','DASH','SC','XMR','XEM']
altcoin_data = {}
for altcoin in altcoins:
    coinpair = 'history/BTC_{}'.format(altcoin)
    crypto_price_df = get_crypto_data(coinpair)
    altcoin_data[altcoin] = crypto_price_df
# Calculate USD Price as a new column in each altcoin dataframe
# (BTC-quoted weighted average * average BTC/USD price, aligned by date).
for altcoin in altcoin_data.keys():
    altcoin_data[altcoin]['price_usd'] = altcoin_data[altcoin]['weightedAverage'] * btc_usd_datasets['avg_btc_price_usd']
# Merge USD price of each altcoin into single dataframe
combined_df = merge_dfs_on_column(list(altcoin_data.values()), list(altcoin_data.keys()), 'price_usd')
# Add BTC price to the dataframe
combined_df['BTC'] = btc_usd_datasets['avg_btc_price_usd']
# Chart all of the altocoin prices
df_scatter(combined_df, 'Cryptocurrency Prices (USD)', seperate_y_axis=False, y_axis_label='Coin Value (USD)', scale='log')
# Calculate the pearson correlation coefficients for cryptocurrencies in 2016
combined_df_2016 = combined_df[combined_df.index.year == 2016]
combined_df_2016.pct_change().corr(method='pearson')
def correlation_heatmap(df, title, absolute_bounds=True):
    '''Plot a plotly heatmap of pairwise Pearson correlations of df's columns.

    Parameters:
        df              : DataFrame; column-vs-column correlation is shown.
        title           : chart title.
        absolute_bounds : if True, pin the colour scale to the full [-1, 1].
    '''
    heatmap = go.Heatmap(
        # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
        # supported way to obtain the underlying ndarray.
        z=df.corr(method='pearson').values,
        x=df.columns,
        y=df.columns,
        colorbar=dict(title='Pearson Coefficient'),
    )
    layout = go.Layout(title=title)
    if absolute_bounds:
        heatmap['zmax'] = 1.0
        heatmap['zmin'] = -1.0
    fig = go.Figure(data=[heatmap], layout=layout)
    py.iplot(fig)
correlation_heatmap(combined_df_2016.pct_change(), "Cryptocurrency Correlations in 2016")
# Repeat the correlation analysis for 2017.
combined_df_2017 = combined_df[combined_df.index.year == 2017]
combined_df_2017.pct_change().corr(method='pearson')
correlation_heatmap(combined_df_2017.pct_change(), "Cryptocurrency Correlations in 2017")
# figure(figsize=(16,16))
# Static matplotlib version of the same heatmap, for saving to file.
imshow(combined_df_2017.pct_change().corr(), cmap='viridis')
title("Cryptocurrency Correlations in 2017")
colorbar()
xticks(arange(10), combined_df_2017.columns, rotation=90)
yticks(arange(10), combined_df_2017.columns)
savefig('fig/altcoin.jpeg', dpi=200, bbox_inches='tight')
combined_df_2017.columns
import requests
from bs4 import BeautifulSoup
def archive_link(ts):
    '''Build the WSJ daily-archive URL for the calendar date of *ts*.'''
    day = ts.date().isoformat()
    return "http://www.wsj.com/public/page/archive-{}.html".format(day)
def news_archive(ts):
    '''Scrape all article teasers from the WSJ archive page for ts's date.

    Returns a list of [timestamp, title, content, url] rows.
    Raises AssertionError when the archive page request is not OK.
    '''
    news = []
    link = archive_link(ts)
    response = requests.get(link)
    assert response.ok, link
    soup = BeautifulSoup(response.text, "html5lib")
    # Each <li> under ul.newsItem is one article teaser.
    pieces = soup.find("ul", {"class":"newsItem"}).find_all('li')
    for piece in pieces:
        title = piece.find('h2').text
        url = piece.find('a')['href']
        content = piece.find('p').text
        # _, title2, content = piece.find('p').text.splitlines()
        news.append([ts, title, content, url])
    return news
# return pd.DataFrame(news, columns=['date','title', 'content', 'url'])
# Analysis window: calendar year 2017.
# pd.datetime was removed from pandas; use the stdlib datetime class.
start_date = datetime(2017, 1, 1)
end_date = datetime(2017, 12, 31)
btc_usd_datasets.index
# Load the cached WSJ archive; scrape any days the cache does not cover.
all_news = pd.read_csv('data/wsj_archive.gz', parse_dates=['date'])
if start_date.date() < all_news.date[0].date() or (end_date.date() > all_news.date.iloc[-1].date()):
    updates = []
    # Only scrape dates inside the window that are not already cached.
    for date in set(pd.date_range(start=start_date, end=end_date, freq='D')) - set(all_news.date):
        updates += news_archive(date)
    updates = pd.DataFrame(updates, columns=['date', 'title', 'content', 'url'])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    pd.concat([all_news, updates]).sort_values('date').to_csv('data/wsj_archive.gz', index=None, compression='gzip')
import sqlite3
conn = sqlite3.connect("data/wsj_news.db")
cur = conn.cursor()
try:
    updates
except NameError:
    # 'updates' only exists when the scrape above ran; nothing to persist.
    # (The original bare 'except:' could also swallow unrelated errors.)
    pass
else:
    updates.to_sql('archive', conn, if_exists='append', index=False)
sql_news = pd.read_sql_query("select * from archive;", conn, parse_dates=['date'])
cur.close()
conn.close()
import json
import requests
class v1:
    '''Minimal client for v1 of the holidayapi.com REST API.'''

    key = None  # class-level default; overwritten per instance in __init__

    def __init__(self, key):
        # API key sent with every request unless the caller supplies one.
        self.key = key

    def holidays(self, parameters):
        '''GET /v1/holidays with *parameters*; return the decoded JSON dict.

        Falls back to self.key when the caller did not pass a 'key'.
        On a non-200 response, guarantees an 'error' entry in the result.
        '''
        url = 'https://holidayapi.com/v1/holidays?'
        if 'key' not in parameters:
            parameters['key'] = self.key
        response = requests.get(url, params=parameters)
        data = json.loads(response.text)
        if response.status_code != 200:
            # dict.has_key() was removed in Python 3; the original call
            # would raise AttributeError -- membership test used instead.
            if 'error' not in data:
                data['error'] = 'Unknown error.'
        return data
# NOTE(review): hard-coded API key committed to source -- should be moved to
# an environment variable or an untracked config file.
hapi = v1("e8fe3d7c-d22c-4809-89a4-8fbecbc1eba9")
# Query parameters for the 2017 US public-holiday list.
parameters = {
    # Required
    'country': 'US',
    'year': 2017,
    # Optional
    # 'month': 7,
    # 'day': 4,
    # 'previous': True,
    # 'upcoming': True,
    # 'public': True,
    # 'pretty': True,
}
# (commented) one-off download + flatten of US holidays, cached to CSV.
# US_hdays = hapi.holidays(parameters)
# US_hdates = pd.DataFrame.from_dict(pd.DataFrame.from_dict(US_hdays.get('holidays'), orient='index').loc[:,0].to_dict(), orient='index').sort_values('date')
# US_hdates.index = pd.to_datetime(US_hdates.index)
# US_hdates.to_csv('data/us_holidays.gz', index=None, compression='gzip')
# Same query shape for Chinese holidays.
parameters = {
    # Required
    'country': 'CN',
    'year': 2017,
    # Optional
    # 'month': 7,
    # 'day': 4,
    # 'previous': True,
    # 'upcoming': True,
    # 'public': True,
    # 'pretty': True,
}
# CN_hdays = hapi.holidays(parameters)
# CN_hdates = pd.DataFrame.from_dict(pd.DataFrame.from_dict(CN_hdays.get('holidays'), orient='index').loc[:,0].to_dict(), orient='index')
# CN_hdates.index = pd.to_datetime(CN_hdates.index)
# CN_hdates.to_csv('data/cn_holidays.gz', index=None, compression='gzip')
# Load the cached holiday tables produced by the commented code above.
US_hdates = pd.read_csv('data/us_holidays.gz')
CN_hdates = pd.read_csv('data/cn_holidays.gz')
03/01/2018 version
from 2011-09-13 to 2012-09-12 67579
03/02/2018 version
30094 2012-02-24 The Surveillance Catalog NaN
http://www.wsj.com/articles/SB1000142405297020...
from 2017-01-01 to 2017-12-31
10593 2017-01-08 Test Your Smarts on the ‘Star Wars’ Market, Ra... NaN http://www.wsj.com/articles/test-your-smarts-o...
format the content
remove '\n' and extra space
remove duplicated: 1294 -> 1995
count 1294 -> 1995
unique 226 -> 72
top Pepper...and Salt Pepper...and Salt -> ...
freq 306 -> 687
entries: 66284 -> 43268
all_news = pd.read_csv('data/wsj_archive.gz', parse_dates=['date'])
all_news.shape
# Restrict to the 2017 analysis window, in chronological order.
news = all_news[(all_news.date>=start_date) & (all_news.date<=end_date)].sort_values('date').reset_index(drop=True)
# drop NaN value
news.dropna(inplace=True)
# format the content
# Collapse every run of whitespace (incl. newlines) to a single space.
news.content = news.content.str.split().map(lambda x: ' '.join(x))
# check duplicates
news.content[news.content.duplicated()].describe()
# drop duplicated
news.drop_duplicates(subset=['content'], keep='first', inplace=True)
## change first to False and redo wiki and ap models
news = news.reset_index(drop=True)
# news.content.to_csv('data/wsj_content', index=None, sep='\n')
print(news.shape)
# # generate polarity and subjectivity with TextBlob
# from textblob import TextBlob
# news['polarity'] = news.apply(lambda x: TextBlob(x['content']).sentiment.polarity, axis=1)
# news['subjectivity'] = news.apply(lambda x: TextBlob(x['content']).sentiment.subjectivity, axis=1)
# news.loc[:, ['polarity','subjectivity']].to_csv('data/sentiment.gz', index=None, compression='gzip')
# Attach the cached sentiment columns (polarity, subjectivity) by position.
news = pd.concat([news, pd.read_csv('data/sentiment.gz')], axis=1)
# from nltk.corpus import stopwords
# from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
# from sklearn.decomposition import NMF
# from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# # set of stopwords
# STOPLIST = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS))
# STOPLIST = list(STOPLIST)
# tfidf_vectorizer = TfidfVectorizer(max_df = 0.4, min_df = 15,
# stop_words = STOPLIST)
# tfidf = tfidf_vectorizer.fit_transform(news.content)
# tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# nmf = NMF(n_components = 30, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
# tfidf.shape
# transformed_data = nmf.transform(tfidf)
# transformed_data.shape
# def zero_equals_1(x):
# if x != 0:
# return 1
# else:
# return x
# transformed_data = pd.DataFrame(transformed_data)
# article_topics = transformed_data.applymap(zero_equals_1)
# article_topics["Date"] = news.date
# article_topics["Article_Num"] = range(1, len(article_topics) + 1)
# article_topics.columns = article_topics.columns.astype('str')
# # melted_df = pd.melt(article_topics, id_vars = ["Article_Num", "Date"], var_name = "Topic_Num")
# def print_top_words(model, feature_names, n_top_words):
# n_article = article_topics.drop('Article_Num', axis=1).sum(axis=0)
# for topic_idx, topic in enumerate(model.components_):
# print("Topic #%d; Number of Article: %d" % (topic_idx, n_article[topic_idx]))
# print(" ".join([feature_names[i]
# for i in topic.argsort()[:-n_top_words - 1:-1]]))
# print()
# print_top_words(nmf, tfidf_feature_names, 20)
Topic #0; Number of Article: 9022
russia election russian presidential senate putin probe campaign 2016 french sanctions intelligence party fbi investigation moscow comey interference political committee
Topic #1; Number of Article: 1086
corrections amplifications edition 2017 jan oct aug sept nov july june april march feb dec 24 26 28 17 21
Topic #2; Number of Article: 5066
gerard baker editor scoops guided stories tour point personal 10 journal wall chief best day street trump travel ban latest
Topic #3; Number of Article: 10989
trump president donald administration obama immigration order policy said elect ban mr travel secretary campaign office agenda says washington leaders
Topic #4; Number of Article: 8486
new york jersey cuomo gov art andrew times office book manhattan authority study way research zealand fiction museum shows yorkers
Topic #5; Number of Article: 4087
news media marketing advertising cmo roundup happenings today morning industry biggest ad facebook fox ads google tv digital snap youtube
Topic #6; Number of Article: 5605
tax plan overhaul gop reform republicans senate code taxes corporate cut republican cuts congress income rate pass businesses proposal budget
Topic #7; Number of Article: 3489
review outlook reform trump obama reviews obamacare gop good political labor democrats britain rule film gigot editorial needs putin growth
Topic #8; Number of Article: 5819
oil prices opec production crude energy output gas saudi week cuts rose supply drilling producers global rig rigs count stockpiles
Topic #9; Number of Article: 3836
korea north south korean missile nuclear pyongyang sanctions kim military jong threat tensions ballistic test weapons defense seoul japan launch
Topic #10; Number of Article: 8491
fed federal reserve rates rate inflation yellen raise economy policy meeting year term janet minutes officials increase short increases chairwoman
Topic #11; Number of Article: 3572
photos day editors wall journal selected street chosen sunday thursday friday tuesday wednesday monday march aug celebrate dec june india
Topic #12; Number of Article: 5389
china chinese beijing xi world jinping economy global yuan foreign market growth largest sea country hong kong communist alibaba economic
Topic #13; Number of Article: 10620
quarter earnings profit sales revenue watch growth results scheduled market fourth report second year reported closes expected fiscal know need
Topic #14; Number of Article: 5321
health care gop act affordable senate republicans insurance republican repeal law vote insurers obamacare senators medicaid overhaul legislation democrats replace
Topic #15; Number of Article: 7618
dollar gold prices data rose higher fell copper month weaker investors high level falls metals edged week gains friday lower
Topic #16; Number of Article: 8347
state islamic tillerson syria forces iraq military said iraqi attack officials killed militants secretary rex budget isis mosul california gov
Topic #17; Number of Article: 6368
brexit european eu union minister prime theresa britain europe talks british bloc negotiations deal brussels macron london political pound leaders
Topic #18; Number of Article: 6487
billion debt deal treasury buy sell auction comprising fund sold securities department previously agreed equity week firm private group stake
Topic #19; Number of Article: 11284
investors stocks market stock markets shares funds dow rally bond 500 companies fund gains emerging trading year global indexes industrial
Topic #20; Number of Article: 2591
house white republicans republican security adviser director staff gop said committee flynn ryan communications senior home rep mike press aide
Topic #21; Number of Article: 4742
best books week web selling ended data bookscan npd children gurdon meghan cox author fiction reviews sacks tom sam mysteries
Topic #22; Number of Article: 31490
company chief business executive said ceo world financial companies year firm uber group people plans says technology years car maker
Topic #23; Number of Article: 4528
court supreme judge appeals gorsuch case ban ruling travel justice federal neil ruled law bankruptcy order legal nominee justices judges
Topic #24; Number of Article: 7688
million home estate real property pay homes lists settle asks funding manhattan 100 raised agreed startup square beach raises housing
Topic #25; Number of Article: 5761
street heard wall big banks apple growth good investors main drug chip tech needs price looks tesla makers markets industry
Topic #26; Number of Article: 4747
trade free mexico canada nafta american deficit agreement surplus exports talks global imports pact administration economic canadian pacific mexican economy
Topic #27; Number of Article: 6143
bank central ecb banks policy european rate inflation monetary eurozone stimulus deutsche financial england canada bond japan draghi rates economy
Topic #28; Number of Article: 4599
government bonds bond yields treasurys prices yield strengthened debt note year pulled inflation strengthen treasury selloff data assets demand 10
Topic #29; Number of Article: 5941
city york mayor police blasio nyc council officials schools man department school mayoral public officers said housing bronx people island
Topics of interest:
28 government bonds
[0, 3, 5, 6, 8 ,9, 10, 12, 15, 16, 17, 19, 24, 25, 26, 27, 28]
Uninformative topics (excluded from further analysis):
1, 11, 13, 18, 21, 23, 29
# article_topics.to_csv('data/to_topics.gz', index=None, compression='gzip')
# Cached one-hot topic assignment: one column per NMF topic '0'..'29',
# plus 'Date' and 'Article_Num'.
article_topics = pd.read_csv('data/to_topics.gz', parse_dates=['Date'])
article_topics.shape
article_topics.keys() # 0:29 one-hot topic representation
# correlation_heatmap(article_topics.loc[:, [str(i) for i in range(30)]],'')
No good result
def senti(df, method = 'mean', roll_win = 1, feat = 'sentiment'):
    '''Aggregate column *feat* per day, smoothed over a *roll_win*-day window.

    method: 'median' (rolling median of daily medians), 'sum' (rolling mean
    of daily sums) or anything else -> rolling mean of daily means.
    '''
    daily = df.groupby(['date'])
    if method == 'median':
        return daily.median()[feat].rolling(roll_win).median()
    if method == 'sum':
        return daily.sum()[feat].rolling(roll_win).mean()
    return daily.mean()[feat].rolling(roll_win).mean()
# Daily mean polarity per topic (0 where a topic had no articles that day).
topics = pd.DataFrame()
for t_id in range(30):
    # print(t_id, t_name)
    topic_id = [str(t_id)]
    # Articles assigned to this topic (one-hot column sum > 0).
    one_topic = news[article_topics.loc[:,topic_id].sum(axis=1)>0]
    topics[str(t_id)] = senti(one_topic, feat='polarity')
topics.replace(np.nan, 0, inplace=True)
correlation_heatmap(topics, 'Topic Correlation')
topic_corr = (topics.corr())
# Topic pairs that are essentially uncorrelated (|r| < 0.01)...
mask = (topic_corr<0.01) & (topic_corr>-0.01)
pairs = np.where(mask)
Counter(pairs[0])
pairs
# mask = (topic_corr<0.01) & (topic_corr>-0.01)
# ...and strongly correlated pairs (0.5 < r < 1, diagonal excluded).
hi_pairs = np.where((topic_corr<1) & (topic_corr>0.5))
Counter(hi_pairs[0])
fig = figure(figsize=(8,8))
scatter(pairs[0], pairs[1], marker='x', s=100, c='r')#, vmin=min(wiki_Tcorr.min()))
scatter(hi_pairs[0], hi_pairs[1], marker='*', s=100, c='b')#, vmin=min(wiki_Tcorr.min()))
legend(['abs(corr) < 0.01', 'abs(corr) > 0.5'])
imshow(topics.corr())
colorbar()
title("Pearson Correlation between modeled Topics")
# savefig('fig/topic_corr.jpeg', dpi=200, bbox_inches='tight')
# # pd.melt: one-hot rows to cells
# melted_df.shape
# melted_df.Topic_Num.describe()
# import calendar
# def extract_month_year(date):
# month = date.month
# month_name = calendar.month_name[month]
# year = date.year
# return month_name + " " + str(year)
# melted_df["Month_Year"] = melted_df["Date"].apply(lambda x: extract_month_year(x))
# # melted_df.groupby("Topic_Num").sum().sort_values("value").reset_index()
# bar(arange(30),article_topics.drop('Article_Num',axis=1).sum(axis=0))
# title("Number of Article per Topic")
# Topics kept for the sentiment/price analysis (hand-picked as meaningful).
topic_list = ['0', '3', '5', '6', '8', '9', '10', '12', '15',
              '16', '17', '19', '24', '25', '26', '27', '28']
# melted_df = melted_df[melted_df["Topic_Num"].isin(topic_list)]
# exam news by topic
topic_id = ['9']
# news.content[(melted_df.value[melted_df.Topic_Num == topic_id]==1.0).values].sample(5) # same
# Spot-check: sample 5 articles assigned to the chosen topic.
news.content[article_topics.loc[:,topic_id].sum(axis=1)>0].sample(5)
# Human-readable labels for the hand-picked topics, keyed by topic number.
# NOTE(review): the 'SP500 Maket' / 'Maker Industry' spellings are kept as-is;
# these strings are plotted verbatim and may already appear in saved figures.
topic_num_to_name = {'0' : "Russia Hack Election",
                     '3' : "President Trump ",
                     '5' : "FANG Tech Company",
                     '6' : "Tax Plan Reform",
                     '8' : "Crude Oil",
                     '9' : "North Korea Nuclear",
                     '10' : "Federal Inflation Rate",
                     '12' : "Chinese Economy",
                     '15' : "Dollar & Metal",
                     '16' : "ISIS",
                     '17' : "Brexit in EU",
                     '19' : "SP500 Maket",
                     '24' : "US Housing",
                     '25' : "Maker Industry",
                     '26' : "North American",
                     '27' : "EU Eurozone",
                     '28' : "Federal Bonds"}
# Reshape into a (Topic_Num, Topic_Name) dataframe for later merging.
topic_num_to_name = pd.DataFrame.from_dict(topic_num_to_name, orient = "index").reset_index()
topic_num_to_name.columns = ["Topic_Num", "Topic_Name"]
# final_df = melted_df.merge(topic_num_to_name, on = "Topic_Num")
# Four umbrella groups; 'NaN' strings pad the shorter columns to length 6.
# NOTE(review): the 'Goverment' misspelling is kept deliberately -- it must
# match the values stored in the cached month/day topic tables filtered below.
overall_topics = pd.DataFrame({"Security": ['0', '9', '16', "NaN", "NaN", "NaN"],
                               "Goverment": ['3', '6', '10', '17', '28', "NaN"],
                               "Market": ['5', '8', '15', '19', '24', '25'],
                               "Global": ['12', '26', '27', "NaN", "NaN", "NaN"]})
# fig = figure(figsize=(12,8))
# Horizontal bars: article count per topic; named topics get a text label.
barh(arange(30),article_topics.drop('Article_Num',axis=1).sum(axis=0))
for i,v in enumerate(article_topics.drop('Article_Num',axis=1).sum(axis=0)):
    if str(i) in topic_num_to_name.Topic_Num.values:
        text(v+50, i-0.3, dict(topic_num_to_name.values).get(str(i)), fontweight='bold', fontsize=20)
title("Number of Article per Topic")
# savefig("fig/perTopic.jpeg", dpi=200, bbox_inches='tight')
# (commented) melt/merge pipeline that produced the cached month/day tables.
# melted_ot = pd.melt(overall_topics)
# melted_ot = melted_ot[melted_ot["value"] != "NaN"]
# melted_ot.columns = ["Overall Topic", "Topic_Num"]
# final_df = final_df.merge(melted_ot, on = "Topic_Num")
# final_df = final_df.groupby(["Month_Year", "Topic_Num", "Topic_Name", "Overall Topic"]).sum().reset_index()
# final_df = final_df.drop('Article_Num', axis=1).rename(columns={'Topic_Name': 'Topic Name'})
# day_df = melted_df.merge(topic_num_to_name, on = "Topic_Num")
# day_df = day_df.merge(melted_ot, on = "Topic_Num")
# day_df = day_df.groupby(["Date", "Topic_Num", "Topic_Name", "Overall Topic"]).sum().reset_index()
# day_df = day_df.drop('Article_Num', axis=1).rename(columns={'Topic_Name': 'Topic Name'})
# final_df.to_csv('data/month_topic.gz', index=None, compression='gzip')
# day_df.to_csv('data/day_topic.gz', index=None, compression='gzip')
# Load the cached monthly / daily per-topic article tables.
final_df = pd.read_csv('data/month_topic.gz')
day_df = pd.read_csv('data/day_topic.gz')
# Monthly article trends per umbrella topic (Security / Goverment / Market /
# Global), one seaborn pointplot each, with months ordered Jan..Dec 2017.
order = ["January 2017", "February 2017", "March 2017", "April 2017", "May 2017", "June 2017", "July 2017",
         "August 2017", "September 2017", "October 2017","November 2017", "December 2017" ]
one_topic = final_df[final_df["Overall Topic"] == "Security"]
# NOTE(review): unlike the three plots below, this one plots pct_change and
# omits order=order -- confirm whether that difference is intentional.
g = sns.pointplot(ci = None, x = one_topic["Month_Year"],
                  y = one_topic["value"].pct_change(), hue = one_topic["Topic Name"])
plt.xticks(rotation=15)
g.set(xlabel = "Month", ylabel = "Percentage change of Articles", title = "WSJ Articles on Security Issues")
# plt.savefig("fig/Security_Issues.jpeg", dpi=200, bbox_inches='tight')
order = ["January 2017", "February 2017", "March 2017", "April 2017", "May 2017", "June 2017", "July 2017",
         "August 2017", "September 2017", "October 2017","November 2017", "December 2017" ]
one_topic = final_df[final_df["Overall Topic"] == "Goverment"]
g = sns.pointplot(ci = None, x = one_topic["Month_Year"],
                  y = one_topic["value"], order = order, hue = one_topic["Topic Name"])
plt.xticks(rotation=15)
g.set(xlabel = "Month", ylabel = "Number of Articles", title = "WSJ Articles on Governmental Issues")
# plt.savefig("fig/Governmental_Issues.jpeg", dpi=200, bbox_inches='tight')
order = ["January 2017", "February 2017", "March 2017", "April 2017", "May 2017", "June 2017", "July 2017",
         "August 2017", "September 2017", "October 2017","November 2017", "December 2017" ]
one_topic = final_df[final_df["Overall Topic"] == "Market"]
g = sns.pointplot(ci = None, x = one_topic["Month_Year"],
                  y = one_topic["value"], order = order, hue = one_topic["Topic Name"])
plt.xticks(rotation=15)
g.set(xlabel = "Month", ylabel = "Number of Articles", title = "WSJ Articles on Market Issues")
# plt.savefig("fig/Market_Issues.jpeg", dpi=200, bbox_inches='tight')
order = ["January 2017", "February 2017", "March 2017", "April 2017", "May 2017", "June 2017", "July 2017",
         "August 2017", "September 2017", "October 2017","November 2017", "December 2017" ]
one_topic = final_df[final_df["Overall Topic"] == "Global"]
g = sns.pointplot(ci = None, x = one_topic["Month_Year"],
                  y = one_topic["value"], order = order, hue = one_topic["Topic Name"])
plt.xticks(rotation=15)
g.set(xlabel = "Month", ylabel = "Number of Articles", title = "WSJ Articles on Global Issues")
# plt.savefig("fig/Global_Issues.jpeg", dpi=200, bbox_inches='tight')
# topic_num_to_name is a DataFrame at this point, so DataFrame.get(label)
# looks up a COLUMN and returns None for topic numbers; use the
# dict-of-(Topic_Num, Topic_Name) form, as done elsewhere in this file.
dict(topic_num_to_name.values).get('0')
# average amount of news vs security news
fig = figure()
ax = fig.add_subplot(111)
for topic_id in ['0', '9', '16']:
    # 14-day mean of daily article counts for each security-related topic.
    # Same fix here: the original DataFrame.get(topic_id) returned None,
    # leaving every trace unlabelled in the legend.
    news[article_topics.loc[:,topic_id]==1].groupby('date').count().resample('14D').mean().title.plot(
        ax=ax, label=dict(topic_num_to_name.values).get(topic_id))
legend()
title("Wall Street Journal Articles on Security Topics")
ylabel("Average Number of Articles per Day")
# xticks(rotation=15)
# Overlay total WSJ article volume on a secondary axis for comparison.
ax2 = ax.twinx()
news.groupby('date').count().resample('14D').mean().title.plot(c='k',
    alpha=0.5, ax=ax2)
# average amount of news vs security news -- stacked 14-day bars per topic
fig = figure()
ax = fig.add_subplot(111)
# These series are used below; with the definitions commented out (as in the
# original) t0/t9/t16/wsj were undefined and the cell raised NameError.
t0 = news[article_topics.loc[:,'0']==1].groupby('date').count().resample('14D').mean().title
t9 = news[article_topics.loc[:,'9']==1].groupby('date').count().resample('14D').mean().title
t16 = news[article_topics.loc[:,'16']==1].groupby('date').count().resample('14D').mean().title
wsj = news.groupby('date').count().resample('14D').mean().title
width = 7
# topic_num_to_name is a DataFrame, so .get(label) would return None; use
# the dict-of-pairs form (as at the 'Number of Article per Topic' chart).
bar(t0.index, t0.values, width, label=dict(topic_num_to_name.values).get('0'))
bar(t9.index, t9.values, width, bottom=t0.values, label=dict(topic_num_to_name.values).get('9'))
# Stacked bars: the third segment sits on top of the first two, so its
# bottom is t0+t9 (the original used only t9, overlapping the stacks).
bar(t16.index, t16.values, width, bottom=t0.values+t9.values, label=dict(topic_num_to_name.values).get('16'))
t_all= t0.values+t9.values+t16.values
legend()
title("Wall Street Journal Articles on Security Topics")
ylabel("Average Number of Articles per Day")
# xticks(wsj.index,rotation=15)
# ax2 = ax.twinx()
# Annotate each stack with its share of all WSJ articles that fortnight.
for i in range(len(t_all)):
    ax.text(wsj.index[i]-pd.Timedelta('7d'), t16.values[i], s="%.1f %%" % (t_all[i]/wsj.values[i]*100))
# ax2.ylabel("Percentage of all Articles")
# plot(wsj.index, wsj.values, c='k', alpha=0.5)
# savefig('fig/news_over_time.jpeg', dpi=200, bbox_inches='tight')
news.describe()
news.sample(5).loc[:, ['date','title','polarity','subjectivity']]
sns.jointplot(x='polarity', y='subjectivity', data=news)
pd.options.display.max_colwidth = 200
# Inspect the most negative / most positive 1% of articles.
news[news.polarity < np.percentile(news.polarity, 1)].content.sample(3)
news[news.polarity > np.percentile(news.polarity, 99)].content.sample(3)
# Collapse polarity to its sign: -1, 0 or +1.
news['sentiment'] = news.polarity.map(lambda x: x if x==0 else x/abs(x))
news.groupby(['date']).mean().sentiment.plot()
# NOTE(review): this selection is labelled 'security' but topics 3 and 5 are
# Trump/FANG; the security topics used elsewhere are 0, 9, 16 -- confirm.
topic_id = ['0', '3', '5']
security = news[article_topics.loc[:,topic_id].sum(axis=1)>0]
# security['sentiment'] = security.polarity.map(lambda x: x if x==0 else x/abs(x) )
security.groupby(['date']).median().sentiment.plot()
# median shows the major polarity in the day
# Daily article counts, resampled at day/week/month granularity.
a = pd.DataFrame(news.date.value_counts(sort=False))
fig = figure()
ax = fig.add_subplot(111)
a.resample('1D').mean().plot(alpha=0.1, legend=False, title='Average amount of News', ax=ax)
a.resample('7D').mean().plot(alpha=0.5, legend=False, title='Average amount of News', ax=ax)
a.resample('1M').mean().plot(legend=False, title='Average amount of News', ax=ax)
ax.legend(['per day','per week','per month'])
figure()
# Average article count by weekday (0=Monday).
a.groupby(a.index.dayofweek).mean().plot(kind='bar',
    legend=False,
    title='Average amount of News per day of the week',
    )
xticks(arange(7), ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'), rotation=15)
# BTC news
# Articles mentioning bitcoin / blockchain / hack (case-tolerant regex).
key_word= "([Bb]it[Cc]oin)|([Bb]lock[-]?chain)|([Hh]ack)"
b = news.content[news.content.str.contains(key_word)]
print(b.shape, '\n', b.sample(5))
# key_word= "([Bb]it[Cc]oin)|([Bb]lock[-]?chain)"
b = news[news.content.str.contains(key_word)]
# b = pd.DataFrame(b.date.value_counts(sort=False))
# Daily summed polarity of the matching articles.
b = pd.DataFrame(b.groupby('date').sum().polarity)
fig = figure()
ax = fig.add_subplot(111)
b.resample('1D').mean().plot(alpha=0.8, legend=False, title='Average amount of News', ax=ax)
b.resample('7D').mean().plot(alpha=0.5, legend=False, title='Average amount of News', ax=ax)
b.resample('1M').mean().plot(alpha = 0.2, legend=False, title='Average amount of News', ax=ax)
# Overlay daily median polarity of the 'security' subset for comparison.
security.groupby('date').median().polarity.resample('1D').mean().plot(ax=ax)
ax.legend(['per day','per week','per month','security'])
figure()
# Average by weekday (0=Monday).
b.groupby(b.index.dayofweek).mean().plot(kind='bar',
    legend=False,
    title='Average amount of News per day of the week',
    )
xticks(arange(7), ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'), rotation=15)
# 2017 BTC price and USD volume, day/week/month resampled on twin log axes.
btc_2017 = btc_usd_datasets[(btc_usd_datasets.index >= start_date) & (btc_usd_datasets.index <= end_date )].loc[:, ['avg_btc_price_usd']]
btc_2017['vol'] = btc_vol_datasets[(btc_vol_datasets.index >= start_date) & (btc_vol_datasets.index <= end_date )].loc[:, ['tot_btc_vol_usd']]
btc_2017.columns = ['Price', 'Volume']
c = btc_2017.Price
d = btc_2017.Volume
fig = figure()
ax = fig.add_subplot(111)
ax2 = ax.twinx()
c.resample('1D').mean().plot(alpha=0.1, legend=False, ax=ax)
c.resample('7D').mean().plot(alpha=0.5, legend=False, ax=ax)
c.resample('1M').mean().plot(legend=False, ax=ax)
ax.legend(['day Price','7 day Price','30 day Price'], loc=0)
ax.semilogy()
d.resample('1D').mean().plot(c='k', alpha=0.1, legend=False, ax=ax2)
d.resample('7D').mean().plot(c='k', alpha=0.5, legend=False, ax=ax2)
d.resample('1M').mean().plot(c='k', legend=False, ax=ax2)
ax2.legend(['day Volume','7 day Volume','30 day Volume'], loc=4)
ax2.semilogy()
title("2017 Bitcoin Price/Volume in USD")
# savefig('fig/BTC.jpeg', dpi=200, bbox_inches='tight')
figure()
# Average traded volume by weekday (0=Monday).
d.groupby(d.index.dayofweek).mean().plot(kind='bar',
    legend=False,
    title='Average amount of Volume per day of the week',
    )
xticks(arange(7), ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'), rotation=15)
def senti(df, method = 'mean', roll_win = 1, feat = 'sentiment'):
    """Aggregate a per-article feature into a daily series, then smooth it.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a 'date' column plus the numeric column named `feat`.
    method : {'mean', 'median', 'sum'}
        Daily aggregation. 'median' is smoothed with a rolling median;
        the other two use a rolling mean.
    roll_win : int
        Rolling-window width in days (1 = effectively no smoothing).
    feat : str
        Column to aggregate (e.g. 'sentiment' or 'polarity').

    Returns
    -------
    pd.Series indexed by date.
    """
    # Select the column *before* aggregating: avoids computing the groupby
    # statistic over every column (which fails on non-numeric columns in
    # recent pandas) while producing identical values for `feat`.
    daily = df.groupby('date')[feat]
    if method == 'median':
        return daily.median().rolling(roll_win).median()
    elif method == 'sum':
        return daily.sum().rolling(roll_win).mean()
    else:
        return daily.mean().rolling(roll_win).mean()
# Sanity-check shapes before joining news topics with price data.
btc_2017.shape
article_topics.shape
# The per-topic daily-sentiment table is slow to build, so the original
# generation loop is kept (commented) for provenance and the cached
# result is loaded from disk instead.
# final = pd.DataFrame()
# for t_id, t_name in topic_num_to_name.values:
# # print(t_id, t_name)
# topic_id = [t_id]
# one_topic = news[article_topics.loc[:,topic_id].sum(axis=1)>0]
# final[t_name] = senti(one_topic, feat='sentiment')
# final.replace(np.nan, 0, inplace=True)
# final.to_csv('data/tmp.gz', compression='gzip')
final = pd.read_csv('data/tmp.gz', index_col=0, parse_dates=['date'])
# Days with no news in a topic are NaN; treat them as neutral (0).
final.replace(np.nan, 0, inplace=True)
def corr(df1, df2):
    """Cross-correlation matrix between the columns of two equal-length
    DataFrames.

    Rows of the result are indexed by df2's columns, columns by df1's.
    Uses the population (ddof=0) Pearson formula:
    (E[xy] - E[x]E[y]) / (std(x) * std(y)).
    """
    n_rows = len(df1)
    a = df1.values
    b = df2.values
    # Outer products of the per-column sums and standard deviations give
    # every pairwise combination in one shot.
    sum_outer = np.multiply.outer(b.sum(0), a.sum(0))
    std_outer = np.multiply.outer(b.std(0), a.std(0))
    # Raw cross-products, centred and normalised.
    pearson = (b.T.dot(a) - sum_outer / n_rows) / std_outer / n_rows
    return pd.DataFrame(pearson, df2.columns, df1.columns)
# Cross-correlate per-topic news sentiment with same-day Bitcoin
# price/volume percentage change.
# NOTE(review): corr() assumes both frames have equal length and aligned
# rows — confirm `final` and `btc_2017` cover identical dates.
imshow(corr(final, btc_2017.pct_change().replace(np.nan, 0)), cmap='viridis')
colorbar()
yticks(arange(2), btc_2017.columns)
# 17 is hard-coded to the number of topic columns in `final`.
xticks(arange(17), final.columns, rotation=90)
title("Correlation between Bitcoin Price/Volume Change and News Sentiment per Topic")
# savefig("fig/corr.jpeg", dpi=200, bbox_inches='tight')
# Price (log scale, left axis) against the weekly change in the median
# polarity of security-topic news (right axis, black).
fig = figure()
ax = fig.add_subplot(111)
c.resample('1D').mean().plot(alpha=0.3, legend=False, ax=ax)
c.resample('7D').mean().plot(alpha=0.5, legend=False, ax=ax)
c.resample('1M').mean().plot(legend=False, ax=ax)
ax.legend(['day Price','7 day Price','30 day Price'], loc=0)
ax.semilogy()
# ylabel("Price in USD")
ax2 = ax.twinx()
security.groupby('date').median().polarity.resample('7D').mean().pct_change().plot(ax=ax2,c='k')
ylabel('Median Polarity of Security News')
# d.resample('1D').mean().plot(c='k', alpha=0.1, legend=False, ax=ax2)
# d.resample('7D').mean().plot(c='k', alpha=0.5, legend=False, ax=ax2)
# d.resample('1M').mean().plot(c='k', legend=False, ax=ax2)
# ax2.legend(['day Volume','7 day Volume','30 day Volume'], loc=4)
# ax2.semilogy()
title("2017 Bitcoin Price in USD verse median polarity of WSJ News on Security")
# savefig('fig/BTC_Polarity.jpeg', dpi=200, bbox_inches='tight')
# Weekly price change on its own; the vlines experiments for marking
# large moves are kept commented for reference.
c.resample('7D').mean().pct_change().plot()
# vlines(c.index[(c.pct_change()) > 0.08], 0, c.max(), colors='r', alpha=0.5)
# vlines(c.index[(c.pct_change()) < -0.08], 0, c.max(), colors='g', alpha=0.5)
# vlines(mark.index[mark.values>0.12], 0, c.max(), colors='b')
# vlines(mark.index[mark.values<-0.03], 0, c.max(), colors='k')
# Same comparison, but price *change* vs. polarity change on twin axes.
fig = figure()
ax = fig.add_subplot(111)
# c.resample('1D').mean().plot(alpha=0.3, legend=False, ax=ax)
c.resample('7D').mean().pct_change().plot(alpha=0.5, ax=ax)
# d.resample('7D').mean().pct_change().plot(alpha=0.5, ax=ax)
# c.resample('1M').mean().plot(legend=False, ax=ax)
ax.legend(['7 day Price change'], loc=0)
# ax.semilogy()
# ylabel("Price in USD")
ax2 = ax.twinx()
security.groupby('date').median().polarity.resample('7D').mean().pct_change().plot(ax=ax2,c='k')
ax2.legend(['Median Polarity of Security News'])
# d.resample('1D').mean().plot(c='k', alpha=0.1, legend=False, ax=ax2)
# d.resample('7D').mean().plot(c='k', alpha=0.5, legend=False, ax=ax2)
# d.resample('1M').mean().plot(c='k', legend=False, ax=ax2)
# ax2.legend(['day Volume','7 day Volume','30 day Volume'], loc=4)
# ax2.semilogy()
title("2017 Bitcoin Price Change in USD verse median polarity of WSJ News on Security")
# savefig('fig/BTC_change.jpeg', dpi=200, bbox_inches='tight')
# NOTE(review): the next line is duplicated — likely a stray re-run cell.
security.groupby('date').mean().polarity.plot()
security.groupby('date').mean().polarity.plot()
# mark = security.groupby('date').median().sentiment.rolling(7)
# Daily mean polarity of security news, used as an event marker below.
mark = security.groupby('date').mean().polarity
# Keyword filter for crypto-related articles (BitCoin / blockchain /
# cryptocurrency, tolerant of capitalisation and hyphenation).
key_word= "([Bb]it[Cc]oin)|([Bb]lock[-]?chain)|([Cc]rypto[-]?curren)"
b = news[news.content.str.contains(key_word)]
b.groupby('date').mean().polarity.plot()
# Google-Trends interest restricted to the price series' dates.
# NOTE(review): `pytrends` is used here before build_payload() is called
# below — notebook cells were probably executed out of order; confirm.
mark = pytrends.interest_over_time()[pytrends.interest_over_time().index.isin(c.index)]
# Smooth the price with a 7-day rolling mean (this overwrites `c`).
c= c.rolling(7).mean()
ax = figure().add_subplot(111)
c.plot(logy=True, ax=ax)
# Vertical markers: red/green = price moves beyond +/-5%;
# blue/black = Google-Trend moves beyond +/-20%.
vlines(c.index[(c.pct_change()) > 0.05], 0, c.max(), colors='r')
vlines(c.index[(c.pct_change()) < -0.05], 0, c.max(), colors='g')
vlines(mark.index[mark.BitCoin.pct_change() > 0.2] , 0, c.max(), colors='b')
vlines(mark.index[mark.BitCoin.pct_change() < -0.2], 0, c.max(), colors='k')
ax2 = ax.twinx()
mark.plot(c='y', logy=True, ax=ax2)
# Fetch 5-year Google-Trends interest for "BitCoin" and align it with
# the price index.
kw_list = ["BitCoin"]
pytrends.build_payload(kw_list, cat=0, timeframe='today 5-y', geo='', gprop='')
btc_trend = pytrends.interest_over_time()[pytrends.interest_over_time().index.isin(c.index)]
ax = figure().add_subplot(111)
# The 0.945 coefficient is pre-computed from the .corr() call below.
title("Correlation between Bitcoin Price and Google Trend, with coefficient of 0.945")
btc_trend.plot(logy=True, c='b', alpha=0.5, ax=ax)
# Solid lines mark +/-30% jumps in search interest ...
vlines(btc_trend.index[btc_trend.BitCoin.pct_change()>0.3] , 0, btc_trend.BitCoin.max(),
alpha=0.3, colors='r')
vlines(btc_trend.index[btc_trend.BitCoin.pct_change()<-0.3] , 0, btc_trend.BitCoin.max(),
alpha=0.3, colors='g')
# NOTE(review): legends/labels look swapped — `ax` plots the trend but
# is labelled as price (and vice versa); verify before publishing.
ax.legend(['7 day Bitcoin Price'])
ylabel('Bitcoin Price in USD')
ax2 = ax.twinx()
c.plot(logy=True, ax=ax2, c='k')
ylabel('Google Trend')
ax2.legend(['7 day Google Trend about Bitcoin'], loc=4)
# ... dashed lines mark +/-10% price moves.
vlines(c.index[(c.pct_change()) > 0.1], 0, c.max(),
alpha=0.5, linestyles='dashed', colors='r')
vlines(c.index[(c.pct_change()) < -0.1], 0, c.max(),
alpha=0.5, linestyles='dashed', colors='g')
savefig('fig/google.jpeg', dpi=200, bbox_inches='tight')
# Pearson correlation between trend and price/volume over shared dates.
pd.concat([btc_trend.BitCoin, btc_2017.loc[btc_trend.index, :]],axis=1).corr()
# Dates where price direction and positive median sentiment agree /
# disagree, plus the distribution of daily median polarity.
c.index[(c.pct_change() > 0) & (security.groupby('date').median().sentiment > 0 )]
c.index[(c.pct_change() < 0) & (security.groupby('date').median().sentiment > 0 )]
hist(security.groupby('date').median().polarity)
Word vector
Corpus
established
customized
codecs.open(): Open an encoded file using the given mode and return a wrapped version providing transparent encoding/decoding.
While Word2Vec works on the intuition that the word representation should be good enough to predict the surrounding words, the underlying intuition of Doc2Vec is that the document representation should be good enough to predict the words in the document.
Two training architectures are used: the Distributed Bag of Words model and the skip-gram model. One involves predicting the context words using a centre word, while the other involves predicting a word using its context words.
# 300-dim Doc2Vec document vectors for the WSJ corpus, inferred from
# models pre-trained on Wikipedia and AP-news respectively.
wiki300 = pd.read_csv('data/wiki_vec300.gz', sep=' ', header=None)
ap300 = pd.read_csv('data/apnews_vec300.gz', sep=' ', header=None)
print(wiki300.shape, ap300.shape)
# explore the correlation between the news
def corr_heatmap(df, n=100, random=True, absolute_bounds=True):
    """Plot an interactive Pearson-correlation heatmap for n rows of df.

    Parameters
    ----------
    df : pd.DataFrame
        Rows are observations (news document vectors).
    n : int
        Number of rows to compare.
    random : bool
        Sample n rows at random; otherwise take the first n.
    absolute_bounds : bool
        Pin the colour scale to [-1, 1] so figures are comparable.
    """
    # Pick the rows once, then correlate observations (rows) via .T —
    # this folds the two previously-duplicated branches into one.
    subset = df.sample(n) if random else df.head(n)
    heatmap = go.Heatmap(
        # .to_numpy() replaces DataFrame.as_matrix(), which was removed
        # in pandas 1.0 and would crash here on a modern install.
        z=subset.T.corr(method='pearson').to_numpy(),
        x=df.columns,
        y=df.columns,
        colorbar=dict(title='Pearson Coefficient'),
    )
    layout = go.Layout(title='Pearson Correlation')
    if absolute_bounds:
        heatmap['zmax'] = 1.0
        heatmap['zmin'] = -1.0
    fig = go.Figure(data=[heatmap], layout=layout)
    py.iplot(fig)
# # compare how representative the vector size will be
# vec30 = pd.read_csv('data/cache/wsj_vec30.gz', sep=' ', header=None)
# vec60 = pd.read_csv('data/cache/wsj_vec60.gz', sep=' ', header=None)
# vec300 = pd.read_csv('data/cache/wiki_vec300.gz', sep=' ', header=None)
# vec30_corr = vec30.corr()
# vec60_corr = vec60.corr()
# vec300_corr = vec300.corr()
# fig = figure(figsize=(18,6))
# ax1 = fig.add_subplot(1,3,1)
# ax1.imshow(vec30_corr, cmap='viridis')
# pct = (np.count_nonzero(np.where(vec30_corr>0.5)[0])-29)/(30*30-30)*100
# ax1.set_title('Correlated Feature Pairs: %0.2f%%' % pct)
# ax2 = fig.add_subplot(1,3,2)
# ax2.imshow(vec60_corr, cmap='viridis')
# pct = (np.count_nonzero(np.where(vec60_corr>0.5)[0])-59)/(60*60-60)*100
# ax2.set_title('Correlated Feature Pairs: %0.2f%%' % pct)
# ax3 = fig.add_subplot(1,3,3)
# ax3.imshow(vec300_corr, cmap='viridis')
# pct = (np.count_nonzero(np.where(vec300_corr>0.5)[0])-299)/(300*300-300)*100
# ax3.set_title('Correlated Feature Pairs: %0.2f%%' % pct)
# # savefig('fig/feature_dim.jpeg', dpi=200, bbox_inches='tight')
# n = 300
# vec30_Tcorr = vec30.head(n).T.corr()
# vec60_Tcorr = vec60.head(n).T.corr()
# vec300_Tcorr = vec300.head(n).T.corr()
# fig = figure(figsize=(18,6))
# ax1 = fig.add_subplot(1,3,1)
# im = ax1.imshow(vec30_Tcorr, cmap='plasma')
# pct = (np.count_nonzero(np.where(vec30_Tcorr<0.5)[0])-n+1)/n/(n-1)*100
# ax1.set_title('Uncorrelated News: %0.2f%%' % pct)
# ax2 = fig.add_subplot(1,3,2)
# im = ax2.imshow(vec60_Tcorr, cmap='plasma')
# pct = (np.count_nonzero(np.where(vec60_Tcorr<0.5)[0])-n+1)/n/(n-1)*100
# ax2.set_title('Uncorrelated News: %0.2f%%' % pct)
# ax3 = fig.add_subplot(1,3,3)
# im = ax3.imshow(vec300_Tcorr, cmap='plasma')
# pct = (np.count_nonzero(np.where(vec300_Tcorr<0.5)[0])-n+1)/n/(n-1)*100
# ax3.set_title('Uncorrelated News: %0.2f%%' % pct)
# fig.colorbar(im, ax =[ax1, ax2, ax3])#, orientation='horizontal', aspect=50)
# # savefig('fig/feature_detail.jpeg', dpi=200, bbox_inches='tight')
# # calculate distance
# from scipy.spatial.distance import pdist, squareform
# distances = pdist(vec30.head(300).values, metric='euclidean')
# dist_matrix = squareform(distances)
# heatmap for explore
corr_heatmap(wiki300, n=300, random=False)
# similar news will have higher correlation and unlike news will have lower correlation
# the strips means that news almost have no correlation with all others.
# Upon close check, these are the general topics, i.e. 52, 94, 230
# Pairwise Pearson correlation between the first 300 news vectors.
wiki_Tcorr = wiki300.head(300).T.corr()
pd.options.display.max_colwidth = 200
# unlike pairs
# Pairs of news whose vectors are (nearly) uncorrelated.
mask = (wiki_Tcorr < 0.1) # & (wiki_Tcorr > 0.13)
pairs = np.where(mask)
# Counter(pairs[0])
scatter(pairs[0], pairs[1], c=wiki_Tcorr.values[mask], cmap='plasma')#, vmax=1)
axis('equal')
xlim([0,300])
colorbar()
# Spot-check one uncorrelated pair by printing the article texts.
ind = 2
print(pairs[0][ind], '\t', news.content[pairs[0][ind]])
print(pairs[1][ind], '\t', news.content[pairs[1][ind]])
# Articles that are most often uncorrelated with everything else.
for ind, n in Counter(pairs[0]).most_common(4):
print('news id:', ind ,'\t' ,n ,'times','\t' , news.content[ind])
# like pairs
# Pairs of distinct news with near-identical vectors (corr > 0.9).
like_mask = (wiki_Tcorr<1) & (wiki_Tcorr>0.9)
like_pairs = np.where(like_mask)
Counter(like_pairs[0])
scatter(like_pairs[0], like_pairs[1], c=wiki_Tcorr.values[like_mask])#, vmin=min(wiki_Tcorr.min()))
axis('equal')
xlim([0,300])
colorbar()
ind = 1
print(like_pairs[0][ind], '\t', news.content[like_pairs[0][ind]])
print(like_pairs[1][ind], '\t', news.content[like_pairs[1][ind]])
# very good representation
# Heatmap for save: overlay the unlike (x) and like (o) pairs, then the
# full correlation matrix, for the figure saved to disk.
figure(figsize=(8,8))
scatter(pairs[0], pairs[1], marker='x', alpha=0.5, c=wiki_Tcorr.values[mask], cmap='plasma')#, vmax=1)
scatter(like_pairs[0], like_pairs[1], marker='o', facecolors=None, alpha=0.5, c=wiki_Tcorr.values[like_mask], cmap='plasma')#, vmin=min(wiki_Tcorr.min()))
legend(['unlike pairs','like pairs'], loc=1)
imshow(wiki_Tcorr)
colorbar()
# Fixed typo in the displayed title ("bewteen" -> "between").
title("Pearson Correlation between first 300 News")
# savefig('fig/wiki300.jpeg', dpi=200, bbox_inches='tight')
TF-IDF model: a noticeably weaker representation, shown below for comparison.
# Repeat the like/unlike pair analysis on the TF-IDF representation.
n=300
tfidf_Tcorr = transformed_data.head(n).T.corr()
# unlike pairs
mask = (tfidf_Tcorr < 0.1) & (tfidf_Tcorr > -0.1)
pairs = np.where(mask)
# Counter(pairs[0])
scatter(pairs[0], pairs[1], c=tfidf_Tcorr.values[mask], cmap='plasma')#, vmax=1)
axis('equal')
xlim([0,300])
colorbar()
ind = 5000
print(pairs[0][ind], '\t', news.content[pairs[0][ind]])
print(pairs[1][ind], '\t', news.content[pairs[1][ind]])
# NOTE(review): the loop variable `n` shadows the sample size set above.
for ind, n in Counter(pairs[0]).most_common(4):
print('news id:', ind ,'\t' ,n ,'times','\t' , news.content[ind])
# like pairs
like_mask = (tfidf_Tcorr<1) & (tfidf_Tcorr>0.99)
like_pairs = np.where(like_mask)
# Counter(like_pairs[0])
scatter(like_pairs[0], like_pairs[1], c=tfidf_Tcorr.values[like_mask], cmap='plasma')#, vmax=1)
axis('equal')
xlim([0,300])
colorbar()
ind = 2
print(like_pairs[0][ind], '\t', news.content[like_pairs[0][ind]])
print(like_pairs[1][ind], '\t', news.content[like_pairs[1][ind]])
# Heatmap for save: same overlay as the Doc2Vec figure, but for the
# TF-IDF topic-model representation; this one is written to disk.
figure(figsize=(8,8))
scatter(pairs[0], pairs[1], marker='x', alpha=0.5, c=tfidf_Tcorr.values[mask], cmap='plasma')#, vmax=1)
scatter(like_pairs[0], like_pairs[1], marker='o', facecolors=None, alpha=0.5, c=tfidf_Tcorr.values[like_mask], cmap='plasma')#, vmin=min(wiki_Tcorr.min()))
legend(['unlike pairs','like pairs'], loc=1)
imshow(tfidf_Tcorr, cmap='viridis')
colorbar()
# Fixed typo in the displayed title ("bewteen" -> "between").
title("Pearson Correlation between First 300 News from Topic Modeling")
savefig('fig/tfidf.jpeg', dpi=200, bbox_inches='tight')
from sklearn.decomposition import PCA
# Fit a 30-component PCA on the 300-dim document vectors.
%time m_pca = PCA(n_components=30, random_state=1).fit(wiki300)
%time pca_transformed_data = m_pca.transform(wiki300)
pca_transformed_data = pd.DataFrame(pca_transformed_data)
# score_samples() returns each sample's log-likelihood under the PCA model.
scores = m_pca.score_samples(wiki300)
print("pca_transformed_data.shape", pca_transformed_data.shape)
# NOTE(review): rounding a log-likelihood mod 30 is a crude hash, not a
# principled topic assignment — confirm this is intentional.
pca_topics = pd.DataFrame(scores.round() % 30)
pca_topics.columns = ["Topic_Num"]
pca_topics["Date"] = news.date
topic_id = [0]
# NOTE(review): comparing a Series to a 1-element list raises in modern
# pandas; the scalar `topic_id[0]` is probably what was intended.
news.content[pca_topics.Topic_Num == topic_id].sample(5)
unsupervised
dimension reduction
remove features
verification
or
from hdbscan import HDBSCAN
from sklearn.decomposition import PCA
from sklearn.cluster import estimate_bandwidth, MeanShift
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.manifold import TSNE
def exam(X, label, params=None):
    """Scatter-plot the first two feature columns of X, coloured by `label`.

    Accepts either a pandas DataFrame (integer column labels 0 and 1) or
    a plain 2-D numpy array; extra keyword args for scatter() go in
    `params`.
    """
    # `params=None` replaces the mutable default `params=dict()`, which
    # is shared between calls and a classic Python pitfall.
    if params is None:
        params = {}
    try:
        # DataFrame path: label-based selection of columns 0 and 1.
        scatter(X.loc[:, 0], X.loc[:, 1], c=label, **params)
        colorbar()
    except Exception:
        # ndarray path (no .loc): positional indexing. Narrowed from a
        # bare `except:` so KeyboardInterrupt/SystemExit still propagate.
        scatter(X[:, 0], X[:, 1], c=label, **params)
        colorbar()
    return
# parameters
n = 10000 # wiki300.shape[0]
dim = 10
plot_param = dict({'alpha': 0.5 , 's': 10 , 'cmap':cm.Accent})
# data
X = wiki300.head(n)
y = pd.read_csv('data/sentiment.gz').head(n)
# Cached HDBSCAN cluster labels (generation kept commented below).
y_DB = np.load('y_DB.temp.npy')[:n]
# y_MS = np.load('y_MS.temp.npy')[:n]
# 1.A
# m_DB = hdbscan.HDBSCAN()
# %time y_DB = m_DB.fit_predict(X)
Counter(y_DB)
# mask data
# Keep HDBSCAN noise (-1) and the dominant cluster (0) only.
mask = (y_DB == -1) | (y_DB == 0)
# news.content.head(n)[y_DB==1]
# news.content.head(n)[y_DB==3]
news.content.head(n)[y_DB==-1].sample(2)
news.content.head(n)[y_DB==0].sample(2)
exam(X, y.subjectivity, plot_param)
# # 1.B
# %time m_MS = MeanShift(estimate_bandwidth(X[mask], quantile=0.2, n_samples=500))
# %time y_MS = m_MS.fit_predict(X[mask], y=news.subjectivity[mask])
# len(unique(y_MS))
# Counter(y_MS).most_common(10)
# Further restrict the mask to articles with positive subjectivity.
mask = (y.subjectivity>0) & mask
sns.distplot(y.subjectivity[mask])
# 2.A reduce dimension
m_pca = PCA(n_components=dim)
X_pca = m_pca.fit_transform(X[mask], y.subjectivity[mask])
exam(X_pca[:,[1,2]], y.subjectivity[mask], plot_param)
# Pack PCA components + subjectivity into one frame for pair-plotting.
_ = np.column_stack([X_pca, y.subjectivity[mask].values])
reduced = pd.DataFrame(_, columns=[str(i) for i in range(dim+1)])
# sns.pairplot(data=,
# vars=[str(i) for i in range(dim)], hue='%d' % dim)
# sns.pairplot(reduced.sample(100), vars=['0','1','2'], hue='3')
# m_pca = PCA(n_components=dim)
# m_tsne = TSNE(n_components=3)
# # supervised
# m_RF = RandomForestClassifier()
# m_KNN = KNeighborsClassifier(n_neighbors=dim)
# m_MLP = MLPClassifier()
# m_GP = GaussianProcessClassifier()
# # unsupervised, fit_predict labels
# m_DB = DBSCAN(min_samples=10) # DBSCAN: mark outliers
# m_MS = MeanShift(estimate_bandwidth(X))
# y_DB = m_DB.fit_predict(X)
# Counter(y_DB)
# y_MS = m_MS.fit_predict(X)
# Counter(y_MS)
# exam(X, y_DB)
# exam(X, y_MS)
# X_pca = m_pca.fit_transform(X)
# exam(X_pca, y_MS)
# X_tsne = m_tsne.fit_transform(X)
# exam(X_tsne[:,[1,2]], y_MS)
import gensim.models as g
import codecs, gzip
# Tokenised WSJ article bodies, one article per line in the gzip file;
# decode errors are replaced rather than raised.
news_split = [ x.strip().split() for x in codecs.getreader('utf-8')(gzip.open('data/wsj_content.gz'), errors='replace') ]
# Candidate Doc2Vec models: two trained on WSJ, plus pre-trained
# Wikipedia and AP-news DBOW models (local absolute paths).
wsj30 = 'model/model30-10.bin'
wsj60 = 'model/model60-15.bin'
wiki_m = '/Users/domi/Desktop/wordvector/enwiki_dbow/doc2vec.bin'
ap_m = '/Users/domi/Desktop/wordvector/apnews_dbow/doc2vec.bin'
m = g.Doc2Vec.load(wiki_m)
# Quick inspection of the loaded model.
# NOTE(review): m.wv.vocab / m.docvecs are gensim<4 APIs (renamed to
# key_to_index / dv in gensim 4) — pin the gensim version.
m.corpus_count
len(m.wv.vocab)
m.docvecs.count
m.vector_size
m.wv.similar_by_word('bitcoin')
# Infer a vector for one article and sanity-check the round trip.
ind = 1
news.content[ind]
news_split[ind] == news.content[ind].split()
vec = m.infer_vector(news_split[ind])
vec.shape
m.wv.similar_by_vector(vec)
word = m.wv.word_vec('photo')
corrcoef(word, vec)
len(m.docvecs)
# Classic analogy sanity check (king - queen vs. man - woman): inferring
# a new vector must not mutate the model or grow its document store.
words = "king queen man".split()
len_before = len(m.docvecs) #number of docs
#word vectors for king, queen, man
w_vec0 = m[words[0]]
w_vec1 = m[words[1]]
w_vec2 = m[words[2]]
new_vec = m.infer_vector(words)
len_after = len(m.docvecs)
print(np.array_equal(m[words[0]], w_vec0)) # True
print(np.array_equal(m[words[1]], w_vec1)) # True
print(np.array_equal(m[words[2]], w_vec2)) # True
print(len_before == len_after) #True
w_vec3 = m['woman']
# NOTE(review): `dist` is not defined by %pylab or the imports shown —
# scipy.spatial.distance was likely intended; this cell would NameError.
dist(w_vec0, w_vec1)
dist(w_vec2, w_vec3)
corrcoef(w_vec0-w_vec1, w_vec2-w_vec3)
corrcoef(w_vec0, w_vec2)
# Project the four word vectors to 2-D for a visual analogy check.
m_pca = PCA(n_components=2)
w_vecs = m_pca.fit_transform([w_vec0, w_vec1, w_vec2, w_vec3])
scatter(w_vecs[:,0],w_vecs[:,1])